{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from urllib.parse import urljoin, urlparse\n",
    "import subprocess\n",
    "import os\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "from tqdm import tqdm\n",
    "import threading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    " df_url =  pd.read_csv(\"target_urls.csv\")\n",
    " urls = df_url['url']\n",
    " header={'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.35',\n",
    " 'Referer':'https://cn.bing.com/'}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 获取urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_url(url,base):\n",
    "    links=[]\n",
    "    try:\n",
    "        # 发送 GET 请求\n",
    "        response = requests.get(url=url,headers=header)\n",
    "        # print(response)\n",
    "        # 检查响应状态码\n",
    "        if response.status_code == 200:\n",
    "            # 使用 BeautifulSoup 解析 HTML 内容\n",
    "            soup = BeautifulSoup(response.text, 'html.parser')\n",
    "            # 找到页面中的所有链接\n",
    "            links = soup.find_all('a', href=True)\n",
    "            # 筛选域名开头的链接\n",
    "            links = [link.get('href') for link in links if link.get('href') and link.get('href').startswith(base)]\n",
    "        else:\n",
    "            print(f\"无法访问网站：{url}\")\n",
    "    except Exception as e:\n",
    "        print(f\"发生错误：{e}\")\n",
    "    return links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "451\n"
     ]
    }
   ],
   "source": [
    "res_urls=[]\n",
    "for url in urls:\n",
    "    # 获取目标网站url\n",
    "    targets=get_url(url,url)\n",
    "    res_url=[]\n",
    "    for target in targets:\n",
    "        #获取子网站url\n",
    "        son_url=get_url(target,url)\n",
    "        #列表降维\n",
    "        son_url = [sublist for sublist in son_url]\n",
    "        # 列表拼接\n",
    "        res_url.extend(son_url)\n",
    "        # 列表去重\n",
    "        res_url=list(set(res_url))\n",
    "        if(len(res_url)>300): break\n",
    "    # 元素去重\n",
    "    print(len(res_url))\n",
    "    res_urls.append(res_url)\n",
    "\n",
    "# 创建 DataFrame\n",
    "df = pd.DataFrame({'urls': urls, 'res_urls': res_urls})\n",
    "\n",
    "# 将 DataFrame 保存到 CSV 文件\n",
    "df.to_csv('res_urls.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 捕获流量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Running as user \"root\" and group \"root\". This could be dangerous.\n",
      "Capturing on 'wlp4s0'\n",
      " ** (tshark:324529) 19:00:28.978709 [Main MESSAGE] -- Capture started.\n",
      " ** (tshark:324529) 19:00:28.978860 [Main MESSAGE] -- File: \"./data_flow/www.3dmgame.com_.pcap\"\n",
      "https://www.3dmgame.com: 100%|██████████| 451/451 [09:39<00:00,  1.29s/suburl]  \n"
     ]
    }
   ],
   "source": [
    "# 创建保存流量的目录\n",
    "data_flow_dir = \"./pcap_data\"\n",
    "os.makedirs(data_flow_dir, exist_ok=True)\n",
    "\n",
    "# 设置Chrome选项\n",
    "chrome_options = Options()\n",
    "# chrome_options.add_argument(\"--headless\")  # 如果你不需要浏览器界面，可以添加这个选项\n",
    "\n",
    "# 初始化Chrome浏览器\n",
    "driver_path = \"./driver/chromedriver\"\n",
    "driver = webdriver.Chrome(executable_path=driver_path, options=chrome_options)\n",
    "wait = WebDriverWait(driver, 20)  # 设置显式等待，最长等待时间为20秒\n",
    "\n",
    "\n",
    "# 定义一个函数用于等待页面完全加载\n",
    "def wait_for_page_load(driver):\n",
    "    wait.until(lambda d: d.execute_script(\"return document.readyState\") == \"complete\")\n",
    "\n",
    "\n",
    "# 访问每个url\n",
    "for index, row in df.iterrows():\n",
    "    url = row[\"urls\"]\n",
    "    res_urls = row[\"res_urls\"]\n",
    "\n",
    "    # 创建当前URL的流量文件名\n",
    "    file_name = os.path.join(data_flow_dir, url.split(\"//\")[1] + \".pcap\")\n",
    "\n",
    "    # 启动tshark捕获流量\n",
    "    tshark_command = [\n",
    "        \"sudo\",\n",
    "        \"tshark\",\n",
    "        \"-i\",\n",
    "        \"wlp4s0\",\n",
    "        \"-w\",\n",
    "        file_name,\n",
    "        \"-f\",\n",
    "        f'host {url.split(\"//\")[1]}',\n",
    "    ]\n",
    "    tshark_process = subprocess.Popen(tshark_command)\n",
    "\n",
    "    # 访问主url\n",
    "    driver.get(url)\n",
    "    wait_for_page_load(driver)  # 等待页面完全加载\n",
    "\n",
    "    with tqdm(total=len(res_urls), desc=url, unit=\"suburl\") as pbar:\n",
    "        # 访问子url\n",
    "        for res_url in res_urls:\n",
    "            driver.get(res_url)\n",
    "            wait_for_page_load(driver)  # 等待页面完全加载\n",
    "            pbar.update(1)\n",
    "\n",
    "    tshark_process.kill()\n",
    "\n",
    "driver.quit()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 解析pcap为json\n",
    "运行脚本./pcap_to_json.sh"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
